For the final/midterm project, I wish to use the real breast cancer dataset on kaggle to perform analysis on breast cancer status and its relations with 4 kinds of proteins, tumor stages, histology, and so on. I also want to merge the dataset with other kinds of sequential data (over time) to predict patient statuses more dynamically.
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(knitr)
library(ggplot2)
library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.8-42. For overview type 'help("mgcv-package")'.
le <- read.csv("/Users/masonhyz/Downloads/life-expectancy-of-women-vs-life-expectancy-of-men.csv")
ac <- read.csv("/Users/masonhyz/Downloads/WHOAlcoholTotalPerCapita_2021-09-20v2.csv")
tidy_le <- le %>%
pivot_longer(
cols = starts_with("Life.expectancy"),
names_to = c(".value", "Sex"),
names_sep = "...Sex..",
values_to = "Life_expectancy"
) %>%
mutate(Sex = case_when(
Sex == "female...Age..at.birth...Variant..estimates" ~ "Female",
Sex == "male...Age..at.birth...Variant..estimates" ~ "Male",
TRUE ~ as.character(Sex)
)) %>% na.omit()
names(tidy_le)[names(tidy_le) == "Population...Sex..all...Age..all...Variant..estimates"] <- "Population"
names(tidy_le)[names(tidy_le) == "Entity"] <- "Country"
head(tidy_le)
## # A tibble: 6 × 7
## Country Code Year Population Continent Sex Life.expectancy
## <chr> <chr> <int> <dbl> <chr> <chr> <dbl>
## 1 Afghanistan AFG 1950 7480464 "" Female 28.4
## 2 Afghanistan AFG 1950 7480464 "" Male 27.1
## 3 Afghanistan AFG 1951 7571542 "" Female 28.6
## 4 Afghanistan AFG 1951 7571542 "" Male 27.4
## 5 Afghanistan AFG 1952 7667534 "" Female 29.1
## 6 Afghanistan AFG 1952 7667534 "" Male 27.8
tidy_ac <- ac %>% filter(Sex != "Both sexes")
names(tidy_ac)[names(tidy_ac) == "Alcohol.total.per.capita..15...consumption.in.liters..numeric."] <- "Liter.per.capita"
names(tidy_ac)[names(tidy_ac) == "Alcohol.total.per.capita..15...consumption.in.liters..low.estimation."] <- "Liter.per.capita.low"
names(tidy_ac)[names(tidy_ac) == "Alcohol.total.per.capita..15...consumption.in.liters..high.estimation."] <- "Liter.per.capita.high"
names(tidy_ac)[names(tidy_ac) == "Alcohol.total.per.capita..15...consumption.in.liters..string."] <- "Liter.per.capita.string"
acle <- inner_join(tidy_ac, tidy_le, by = c("Country", "Year", "Sex"))
acle_summary <- acle %>%
group_by(Year, Sex) %>%
summarise(
mean_life_expectancy = mean(Life.expectancy, na.rm = TRUE),
sd_life_expectancy = sd(Life.expectancy, na.rm = TRUE),
mean_liter_per_capita = mean(Liter.per.capita, na.rm = TRUE),
sd_liter_per_capita = sd(Liter.per.capita, na.rm = TRUE),
.groups = 'keep'
)
kable(acle_summary, format = "html", caption = "Summary of Life Expectancy and Alcohol Consumption")
| Year | Sex | mean_life_expectancy | sd_life_expectancy | mean_liter_per_capita | sd_liter_per_capita |
|---|---|---|---|---|---|
| 2000 | Female | 69.14036 | 10.571545 | 2.523819 | 2.164264 |
| 2000 | Male | 64.09157 | 9.526031 | 9.235145 | 7.007668 |
| 2001 | Female | 69.44096 | 10.645375 | 2.523819 | 2.164264 |
| 2001 | Male | 64.38735 | 9.517500 | 9.235145 | 7.007668 |
| 2002 | Female | 69.71325 | 10.525493 | 2.518374 | 2.159343 |
| 2002 | Male | 64.67289 | 9.443737 | 9.205151 | 6.992798 |
| 2003 | Female | 69.97952 | 10.466718 | 2.518379 | 2.146989 |
| 2003 | Male | 64.95482 | 9.358519 | 9.192428 | 6.937405 |
| 2004 | Female | 70.26145 | 10.352038 | 2.534066 | 2.126814 |
| 2004 | Male | 65.28072 | 9.277676 | 9.236566 | 6.852761 |
| 2005 | Female | 70.67410 | 10.235689 | 2.568946 | 2.141408 |
| 2005 | Male | 65.61988 | 9.186790 | 9.332512 | 6.862059 |
| 2006 | Female | 71.08133 | 10.076397 | 2.602223 | 2.159834 |
| 2006 | Male | 65.98976 | 9.091239 | 9.415946 | 6.897050 |
| 2007 | Female | 71.45179 | 9.862309 | 2.666607 | 2.204951 |
| 2007 | Male | 66.36250 | 8.897687 | 9.623006 | 7.063432 |
| 2008 | Female | 71.78690 | 9.729094 | 2.628405 | 2.169372 |
| 2008 | Male | 66.72679 | 8.811584 | 9.498554 | 6.967984 |
| 2009 | Female | 72.17024 | 9.494250 | 2.589732 | 2.104763 |
| 2009 | Male | 67.11310 | 8.632778 | 9.402869 | 6.799430 |
| 2010 | Female | 72.43036 | 9.476348 | 2.558833 | 2.043933 |
| 2010 | Male | 67.42976 | 8.645822 | 9.328167 | 6.627876 |
| 2011 | Female | 72.88690 | 9.043041 | 2.573530 | 2.030884 |
| 2011 | Male | 67.84762 | 8.360751 | 9.399952 | 6.609517 |
| 2012 | Female | 73.19643 | 8.772206 | 2.574446 | 2.021947 |
| 2012 | Male | 68.20714 | 8.196615 | 9.408821 | 6.597133 |
| 2013 | Female | 73.52976 | 8.599322 | 2.565054 | 2.001680 |
| 2013 | Male | 68.50298 | 8.075619 | 9.392661 | 6.546354 |
| 2014 | Female | 73.83810 | 8.482746 | 2.545667 | 1.973115 |
| 2014 | Male | 68.75774 | 8.014385 | 9.340637 | 6.457291 |
| 2015 | Female | 74.04524 | 8.259147 | 2.534435 | 1.953997 |
| 2015 | Male | 68.97917 | 7.839031 | 9.318810 | 6.406935 |
| 2016 | Female | 74.39524 | 8.108581 | 2.522512 | 1.934788 |
| 2016 | Male | 69.27083 | 7.707702 | 9.292619 | 6.362486 |
| 2017 | Female | 74.62202 | 7.952263 | 2.514018 | 1.917295 |
| 2017 | Male | 69.48988 | 7.622961 | 9.273512 | 6.321538 |
| 2018 | Female | 74.85833 | 7.835092 | 2.507702 | 1.912398 |
| 2018 | Male | 69.70655 | 7.547424 | 9.257309 | 6.312783 |
| 2019 | Female | 75.05893 | 7.734047 | 2.507702 | 1.912398 |
| 2019 | Male | 69.90417 | 7.478139 | 9.257309 | 6.312783 |
acle <- acle %>%
group_by(Sex) %>%
mutate(
consumption_level = cut(Liter.per.capita,
breaks = quantile(Liter.per.capita, probs = c(0, 0.25, 0.75, 1), na.rm = TRUE),
labels = c("Low", "Medium", "High"),
include.lowest = TRUE)
)
acle %>%
group_by(consumption_level, Sex) %>%
summarise(
Min_Consumption = min(Liter.per.capita, na.rm = TRUE),
Max_Consumption = max(Liter.per.capita, na.rm = TRUE),
N = n()
)
## `summarise()` has grouped output by 'consumption_level'. You can override using
## the `.groups` argument.
## # A tibble: 6 × 5
## # Groups: consumption_level [3]
## consumption_level Sex Min_Consumption Max_Consumption N
## <fct> <chr> <dbl> <dbl> <int>
## 1 Low Female 0 0.71 844
## 2 Low Male 0 3.29 841
## 3 Medium Female 0.72 3.95 1667
## 4 Medium Male 3.31 14.2 1668
## 5 High Female 3.96 9.69 835
## 6 High Male 14.2 31.4 837
# check dimensions
dim(acle)
## [1] 6692 15
# Variable types
str(acle)
## gropd_df [6,692 × 15] (S3: grouped_df/tbl_df/tbl/data.frame)
## $ WHO.Region.Code : chr [1:6692] "SEAR" "SEAR" "EMR" "EMR" ...
## $ WHO.Region : chr [1:6692] "South-East Asia" "South-East Asia" "Eastern Mediterranean" "Eastern Mediterranean" ...
## $ Country.Code : chr [1:6692] "BGD" "BGD" "KWT" "KWT" ...
## $ Country : chr [1:6692] "Bangladesh" "Bangladesh" "Kuwait" "Kuwait" ...
## $ Year : int [1:6692] 2019 2019 2019 2019 2019 2019 2019 2019 2019 2019 ...
## $ Sex : chr [1:6692] "Female" "Male" "Female" "Male" ...
## $ Liter.per.capita : num [1:6692] 0 0 0 0 0 0 0 0 0 0 ...
## $ Liter.per.capita.low : num [1:6692] 0 0 0 0 0 0 0 0 0 0 ...
## $ Liter.per.capita.high : num [1:6692] 0 0 0 0 0 0 0 0 0 0 ...
## $ Liter.per.capita.string: chr [1:6692] "0 [0 – 0]" "0 [0 – 0]" "0 [0 – 0]" "0 [0 – 0]" ...
## $ Code : chr [1:6692] "BGD" "BGD" "KWT" "KWT" ...
## $ Population : num [1:6692] 1.66e+08 1.66e+08 4.44e+06 4.44e+06 4.38e+06 ...
## $ Continent : chr [1:6692] "" "" "" "" ...
## $ Life.expectancy : num [1:6692] 75.1 70.7 82.3 78.2 67.7 63.8 78.9 76.1 59.1 55.1 ...
## $ consumption_level : Factor w/ 3 levels "Low","Medium",..: 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "groups")= tibble [2 × 2] (S3: tbl_df/tbl/data.frame)
## ..$ Sex : chr [1:2] "Female" "Male"
## ..$ .rows: list<int> [1:2]
## .. ..$ : int [1:3346] 1 3 5 7 9 11 12 13 15 18 ...
## .. ..$ : int [1:3346] 2 4 6 8 10 14 16 17 30 36 ...
## .. ..@ ptype: int(0)
## ..- attr(*, ".drop")= logi TRUE
# Look at the distributions of Sex and consumption against Sex
count(acle, Sex)
## # A tibble: 2 × 2
## # Groups: Sex [2]
## Sex n
## <chr> <int>
## 1 Female 3346
## 2 Male 3346
acle %>% group_by(Sex) %>% count(consumption_level)
## # A tibble: 6 × 3
## # Groups: Sex [2]
## Sex consumption_level n
## <chr> <fct> <int>
## 1 Female Low 844
## 2 Female Medium 1667
## 3 Female High 835
## 4 Male Low 841
## 5 Male Medium 1668
## 6 Male High 837
# Look at the two key variables (Life expectancy and Liters per capita) faceted by Sex
acle %>%
ggplot(aes(x = Liter.per.capita, fill = Sex)) +
geom_histogram(position = "identity", binwidth = 0.5, alpha = 0.3, color = "white") +
scale_fill_manual(values = c("Male" = "red", "Female" = "blue")) +
labs(title = "Distribution of Alcohol Consumption by Sex",
x = "Liter per Capita",
y = "Count") +
theme_minimal() +
guides(fill = guide_legend(title = "Sex"))
acle %>%
ggplot(aes(x = Life.expectancy, fill = Sex)) +
geom_histogram(position = "identity", binwidth = 1, alpha = 0.3, color = "white") +
scale_fill_manual(values = c("Male" = "red", "Female" = "blue")) +
labs(title = "Distribution of Life Expectancy by Sex",
x = "Life Expectancy",
y = "Count") +
theme_minimal() +
guides(fill = guide_legend(title = "Sex"))
In the above chunk, I performed steps 3,4,5 of the EDA checklist:
checked the dimensions of the data, the variable types, and took a
closer look at the distributions at some of the variables.
summary_by_consumption <- acle %>%
group_by(consumption_level) %>%
summarise(
Mean_Life_Expectancy = mean(Life.expectancy, na.rm = TRUE),
SD_Life_Expectancy = sd(Life.expectancy, na.rm = TRUE),
Min_Life_Expectancy = min(Life.expectancy, na.rm = TRUE),
Max_Life_Expectancy = max(Life.expectancy, na.rm = TRUE),
N = n()
)
summary_by_consumption
## # A tibble: 3 × 6
## consumption_level Mean_Life_Expectancy SD_Life_Expectancy Min_Life_Expectancy
## <fct> <dbl> <dbl> <dbl>
## 1 Low 67.4 8.05 47.8
## 2 Medium 67.9 9.62 40.7
## 3 High 75.7 7.91 45.3
## # ℹ 2 more variables: Max_Life_Expectancy <dbl>, N <int>
In the above chunk, I conducted some summary statistics to answer the questions of interest.
ggplot(acle, aes(x = Liter.per.capita, y = Life.expectancy)) +
geom_point(aes(color = Sex), alpha = 0.5) +
labs(title = "Life Expectancy vs. Alcohol Consumption",
x = "Alcohol Consumption (Liters per Capita)",
y = "Life Expectancy") +
theme_minimal()
ggplot(acle, aes(x = Year, y = Life.expectancy, group = Country, color = Country)) +
geom_line(alpha = 0.5) +
facet_wrap(~Sex) +
labs(title = "Life Expectancy Over Time by Country, Faceted by Sex",
x = "Year",
y = "Life Expectancy") +
theme_minimal() +
guides(color = FALSE)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggplot(acle, aes(x = Year, y = Liter.per.capita, group = Country, color = Country)) +
geom_line(alpha = 0.5) +
facet_wrap(~Sex) +
labs(title = "Alcohol Consumption Over Time by Country, Faceted by Sex",
x = "Year",
y = "Alcohol Consumption (Liter per Capita)") +
theme_minimal() +
guides(color = FALSE)
In the above chunk I made some exploratory graphs on the 2 key variables to answer the 3 questions of interest.
acle %>%
ggplot(aes(x = Liter.per.capita, fill = Sex)) +
geom_histogram(position = "identity", binwidth = 0.5, alpha = 0.3, color = "white") +
scale_fill_manual(values = c("Male" = "red", "Female" = "blue")) +
labs(title = "Distribution of Alcohol Consumption by Sex",
x = "Liter per Capita",
y = "Count") +
theme_minimal() +
guides(fill = guide_legend(title = "Sex"))
From the above graph, we can see the male alcohol consumption
distribution is spread almost evenly from 0 to 30 liters per capita,
while female consumption is mostly close to 0 with highest at 8. It
shows that males consume alcohol significantly more than female.
acle %>%
filter(Year %in% c(2000, 2010, 2019)) %>%
ggplot(aes(x = Liter.per.capita, y = Life.expectancy, color = Sex)) +
geom_point(alpha=0.5) +
geom_smooth(method = "lm", se = FALSE) +
facet_wrap(~Year) +
labs(title = "Life Expectancy vs. Alcohol Consumption for 2000, 2010, 2019",
x = "Liter per Capita",
y = "Life Expectancy") +
scale_color_manual(values = c("Male" = "blue", "Female" = "red")) +
theme_minimal() +
guides(color = guide_legend(title = "Sex"))
## `geom_smooth()` using formula = 'y ~ x'
From the above graph we can see, for all three year, life expectancy
seems to be positively correlated with liter per capita, for both male
and female. Female correlation seems be be stronger i.e. the slope of
the regression line seems to be steeper. From 2000 to 2010 to 2019, the
average life expectancy seems to overall increase.
lm_canada <- lm(Life.expectancy ~ Year + Sex, data = filter(acle, Country == "Canada"))
lm_second_country <- lm(Life.expectancy ~ Year + Sex, data = filter(acle, Country == "Nepal"))
summary(lm_canada)
##
## Call:
## lm(formula = Life.expectancy ~ Year + Sex, data = filter(acle,
## Country == "Canada"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.53566 -0.17599 0.03276 0.18809 0.46039
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.621e+02 1.472e+01 -17.81 <2e-16 ***
## Year 1.718e-01 7.326e-03 23.46 <2e-16 ***
## SexMale -4.465e+00 8.449e-02 -52.85 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2672 on 37 degrees of freedom
## Multiple R-squared: 0.9891, Adjusted R-squared: 0.9885
## F-statistic: 1672 on 2 and 37 DF, p-value: < 2.2e-16
summary(lm_second_country)
##
## Call:
## lm(formula = Life.expectancy ~ Year + Sex, data = filter(acle,
## Country == "Nepal"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.05500 -0.20803 0.05579 0.31803 0.71289
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -601.1608 25.5123 -23.56 <2e-16 ***
## Year 0.3332 0.0127 26.24 <2e-16 ***
## SexMale -3.5500 0.1464 -24.25 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.463 on 37 degrees of freedom
## Multiple R-squared: 0.9718, Adjusted R-squared: 0.9703
## F-statistic: 638.3 on 2 and 37 DF, p-value: < 2.2e-16
The effects of Sex and Year on Life expectancy are both significant, as we can conclude from the p-values being extremely small. For Canada, the average increase each year of life expectancy is 0.17 years. And the difference between mean male and female life expectancies is -4.65 years. Nepal has similar statistics, with a slightly smaller effect of Sex and slightly larger effect of Year. The R-squared of both models are more than 0.97, implying goodness of fit of the model.
top_discrepancies <- acle %>% select(c(Year, Country, Sex, Life.expectancy)) %>%
filter(Year %in% c(2000, 2019)) %>%
pivot_wider(names_from = Sex, values_from = Life.expectancy) %>%
mutate(Discrepancy = abs(Male - Female)) %>%
arrange(desc(Discrepancy)) %>%
group_by(Year) %>%
slice_max(order_by = Discrepancy, n = 10) %>%
ungroup()
acle_filtered <- acle %>%
inner_join(top_discrepancies %>% select(Year, Country), by = c("Year", "Country"))
ggplot(acle_filtered, aes(x = Country, y = Life.expectancy, fill = Sex)) +
geom_bar(stat = "identity", position = "dodge", alpha=0.5, color="white") +
facet_wrap(~Year, scales = "free_x") +
labs(title = "Life Expectancies of Countries with Top 10 Discrepancies by Sex",
x = "Country",
y = "Life Expectancy") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_fill_manual(values = c("Male" = "blue", "Female" = "red"))
From the above graph we see that the top countries with top 10 discrepancies in 2000 and 2019 partially overlap, and most of them are in Europe.
acle %>%
filter(Year == 2019) %>%
ggplot(aes(x = consumption_level, y = Life.expectancy, fill = Sex)) +
geom_boxplot(alpha=0.5) +
facet_wrap(~Sex, scales = "free") +
scale_fill_manual(values = c("Male" = "blue", "Female" = "red")) +
labs(title = "Life Expectancy by Alcohol Consumption Level and Sex in 2019",
x = "Alcohol Consumption Level",
y = "Life Expectancy",
fill = "Sex") +
theme_minimal() +
theme(legend.position = "bottom",
axis.text.x = element_text(angle = 45, hjust = 1))
As we can see in the plot, life expectancy for high alcohol consumers
are significantly higher than medium and low consumers for both sex. Low
and medium alcohol consumers have generally the same life
expectancies.
acle_summary <- acle %>%
group_by(Year, Sex, consumption_level) %>%
summarise(
Avg_Life_Expectancy = mean(Life.expectancy, na.rm = TRUE),
SEM_Life_Expectancy = sd(Life.expectancy, na.rm = TRUE) / sqrt(n()), # Calculating SEM
Avg_Alcohol_Consumption = mean(Liter.per.capita, na.rm = TRUE),
.groups = 'drop'
)
ggplot(acle_summary, aes(x = Year, y = Avg_Life_Expectancy, group = consumption_level)) +
geom_ribbon(aes(ymin = Avg_Life_Expectancy - SEM_Life_Expectancy, ymax = Avg_Life_Expectancy + SEM_Life_Expectancy, fill = consumption_level),
alpha = 0.1) +
geom_line(aes(color = consumption_level), size = 0.8) +
facet_wrap(~Sex) +
labs(title = "Life Expectancy Over Time by Sex and Alcohol Consumption (with Error Bars)",
x = "Year",
y = "Average Life Expectancy") +
theme_minimal() +
scale_color_manual(values = c("Low" = "#1f77b4", "Medium" = "#ff7f0e", "High" = "#2ca02c"), guide = FALSE) +
scale_fill_manual(values = c("Low" = "#1f77b4", "Medium" = "#ff7f0e", "High" = "#2ca02c"),
name = "Alcohol Consumption Level") +
theme(
legend.title = element_blank(),
legend.position = "bottom",
panel.spacing = unit(1, "lines"),
axis.text.x = element_text(angle = 45, hjust = 1)
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `guide` argument in `scale_*()` cannot be `FALSE`. This was deprecated in
## ggplot2 3.3.4.
## ℹ Please use "none" instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
As we can see on the above graph, the life expectancy of both male and
female and all alcohol consumption levels increase over the years.
Particularly, the alcohol consumption level high category has a overall
higher life expectancy, even though the life expectancy of male high
consumption group oscillates relatively strongly throughout the
years.
acle <- acle %>%
mutate(Log_Population = log(Population))
linear_model <- lm(Life.expectancy ~ Liter.per.capita + Log_Population + Year + Sex, data = acle)
summary(linear_model)
##
## Call:
## lm(formula = Life.expectancy ~ Liter.per.capita + Log_Population +
## Year + Sex, data = acle)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.595 -5.432 2.045 6.364 16.101
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -582.82875 36.40143 -16.011 <2e-16 ***
## Liter.per.capita 0.54443 0.02114 25.755 <2e-16 ***
## Log_Population -0.11594 0.04795 -2.418 0.0156 *
## Year 0.32618 0.01812 17.997 <2e-16 ***
## SexMale -8.75392 0.25316 -34.579 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.536 on 6687 degrees of freedom
## Multiple R-squared: 0.1919, Adjusted R-squared: 0.1914
## F-statistic: 397.1 on 4 and 6687 DF, p-value: < 2.2e-16
spline_model <- gam(Life.expectancy ~ Liter.per.capita + s(Log_Population) + Year + Sex, data = acle)
summary(spline_model)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## Life.expectancy ~ Liter.per.capita + s(Log_Population) + Year +
## Sex
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -594.47343 34.87505 -17.05 <2e-16 ***
## Liter.per.capita 0.52243 0.02032 25.71 <2e-16 ***
## Year 0.33111 0.01735 19.08 <2e-16 ***
## SexMale -8.60478 0.24260 -35.47 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Log_Population) 8.985 9 69.65 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.26 Deviance explained = 26.1%
## GCV = 66.855 Scale est. = 66.726 n = 6692
As we can see from the linear model output, the model is not really a good fit according to the adjusted R-squared. Only 19 percent of the variation in the data are explained through the four predictors. Three of the predictors for life expectancy—liter per capita, year, and sex—all showed great significance in the model. Logged population, however, seems to be less significant in predicting life expectancy.
In particular, the model predicts that \[\hat{y}=-594.473+0.544x_1-0.116x_2+0.326x_3-8.75x_4\] where \(y\) is life expectancy, \(x_1\) is liter per capita, \(x_2\) is logged population, \(x_3\) is year, and \(x_4\) indicates whether it is for male.
In the spline model we see better findings. The equation in this model, \[\hat{y} = -594.473 + 0.522x_1 + 0.331x_3 - 8.605x_4 + f(x_2)\], shows alcohol consumption (\(x_1\)) and year (\(x_3\)) positively influencing life expectancy, with coefficients of 0.522 and 0.331 respectively. The variable \(x_4\), indicating male sex, negatively impacts life expectancy with a coefficient of -8.605. The non-linear relationship between life expectancy and logged population (\(x_2\)) is modeled by the spline \(f(x_2)\), highlighting the complex effect of population size, and is statistically significant with an edf of 8.985, \(p < 2e-16\).
The adjusted R-squared value of 0.26 indicates that while the model captures a significant portion of variability in life expectancy, a considerable amount of variance is unexplained, suggesting the potential influence of other factors not included in the model.
plot(spline_model, pages = 1)
The x-axis represents the logged population values. And the y-axis shows the estimated effect (smooth term) of the log-transformed population on life expectancy. The values are on a scale that relates to the contribution of the log population to the predicted life expectancy, after accounting for the effects of other variables in the model.
The solid wiggly line in the middle represents the estimated smooth effect of population on life expectancy. The wiggles in the line suggest a non-linear relationship; as the log population increases, the effect on life expectancy goes through a series of peaks and troughs rather than a straight line. Overall, the graph suggests that population size has a complex and potentially periodic effect on life expectancy, and that this effect varies in a non-linear way across the range of population sizes represented in the data.
The dashed lines on either side of the estimated smooth represent the confidence interval around the estimate. The confidence is high where there are a lot of present data, but not at the two ends of logged population.
acle_agg <- acle %>%
group_by(Country, Sex) %>%
summarise(
Avg_Life_Expectancy = mean(Life.expectancy, na.rm = TRUE),
Avg_Liter_per_Capita = mean(Liter.per.capita, na.rm = TRUE),
Log_Population = mean(Log_Population, na.rm = TRUE),
.groups = 'keep'
) %>%
ungroup()
linear_model_agg <- lm(Avg_Life_Expectancy ~ Avg_Liter_per_Capita + Sex + Log_Population, data = acle_agg)
summary(linear_model_agg)
##
## Call:
## lm(formula = Avg_Life_Expectancy ~ Avg_Liter_per_Capita + Sex +
## Log_Population, data = acle_agg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.950 -5.456 1.972 6.339 15.364
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 72.70208 3.36074 21.633 < 2e-16 ***
## Avg_Liter_per_Capita 0.56959 0.09507 5.991 5.42e-09 ***
## SexMale -8.94389 1.12489 -7.951 2.93e-14 ***
## Log_Population -0.12291 0.21182 -0.580 0.562
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.432 on 332 degrees of freedom
## Multiple R-squared: 0.1681, Adjusted R-squared: 0.1606
## F-statistic: 22.37 on 3 and 332 DF, p-value: 3.248e-13
spline_model_agg <- gam(Avg_Life_Expectancy ~ Avg_Liter_per_Capita + Sex + s(Log_Population), data = acle_agg)
summary(spline_model_agg)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## Avg_Life_Expectancy ~ Avg_Liter_per_Capita + Sex + s(Log_Population)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 70.86046 0.66972 105.805 < 2e-16 ***
## Avg_Liter_per_Capita 0.54704 0.09196 5.948 7.01e-09 ***
## SexMale -8.79039 1.08521 -8.100 1.13e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Log_Population) 8.704 8.975 3.616 0.000541 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.221 Deviance explained = 24.6%
## GCV = 68.385 Scale est. = 66.003 n = 336
The linear model summary for average life expectancy over the years revealed a significant relationship with both average alcohol consumption per capita and sex, while the logged population is not a significant predictor. Specifically, the model estimates an average life expectancy, \[\hat{y}=72.702 + 0.570x_1 - 8.944x_2 - 0.123x_3\] indicating that as average alcohol consumption increases by one unit, life expectancy is predicted to increase by approximately 0.570 years, and that being male has a substantial negative impact on life expectancy, by about 8.944 years compared to females. The variable \(x_3\), representing the log of population size, shows a negative coefficient of -0.123, but this relationship is not statistically significant (p = 0.562).
The model’s overall fit is not adequate, with an adjusted R-squared of 0.1606, meaning it explains merely 16.06% of the variability in average life expectancy across countries. However, the F-statistic is 22.37 with a highly significant p-value (less than 0.001), indicating the model’s predictors, as a whole, are statistically significant.
The GAM examining average life expectancy demonstrates significant effects from average alcohol consumption and sex, alongside a significant non-linear relationship with logged population size. As before in the linear model, the spline model suggests that each additional unit of alcohol consumption per capita is associated with a 0.547-year increase in life expectancy, while being male correlates with an 8.790-year reduction compared to females. The spline component for logged population indicates a non-linear effect with significant complexity, highlighting that the impact of population size on life expectancy is not straightforward.
From the adjusted R-squares, we can tell this model accounts for approximately 22.1% of the variation in life expectancy, marking a notable improvement over linear models without the spline term.
ggplot(acle_agg, aes(x = Avg_Liter_per_Capita, y = Avg_Life_Expectancy, color = Sex)) +
geom_point(alpha=0.5) +
geom_smooth(method = "lm", se = FALSE) +
geom_smooth(method = "loess", se = FALSE, linetype = "longdash") +
labs(title = "Average Alcohol Consumption vs. Life Expectancy by Sex",
x = "Average Alcohol Consumption (Liter per Capita)",
y = "Average Life Expectancy") +
scale_color_manual(values = c("Male" = "blue", "Female" = "red")) +
theme_minimal() +
theme(legend.title = element_blank())
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
Same as the previous exploratory plots, for both males and females, the linear regression lines suggest a positive relationship between alcohol consumption and life expectancy, as the slopes are upward. This indicates that, on average, higher alcohol consumption is associated with higher life expectancy. However, the relationship does not seem to be uniform across the range of alcohol consumption; this non-linearity is captured by the smoothed lines, which bend and change slope at different points.